/*    Copyright 2010 Tobias Marschall
 *
 *    This file is part of MoSDi.
 *
 *    MoSDi is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    MoSDi is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with MoSDi.  If not, see <http://www.gnu.org/licenses/>.
 */

package mosdi.subcommands;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import mosdi.fa.Alphabet;
import mosdi.fa.CDFA;
import mosdi.fa.DFAFactory;
import mosdi.fa.GeneralizedString;
import mosdi.util.Iupac;
import mosdi.util.Log;
import mosdi.util.NamedSequence;
import mosdi.util.SequenceUtils;

/**
 * Subcommand {@code count-matches}: searches the sequences of a FASTA file
 * for occurrences of an IUPAC pattern (optionally also of its reverse
 * complement) and reports them.
 *
 * <p>Three mutually exclusive output modes exist: per-sequence match counts
 * (default), the instance list format selected by {@code -t}, and the
 * instance list format selected by {@code -s}.</p>
 */
public class CountMatchesSubcommand extends Subcommand {

	/** Returns the command line synopsis including the supported options. */
	@Override
	public String usage() {
		return 
		super.usage()+" [options] <fasta-file> <iupac-pattern>\n" +
		"\n" +
		"Options:\n" +
		"  -r: output matches of reverse complementary motif\n" +
		"  -t: report matches in format required for upload at\n" +
		"      http://bio.cs.washington.edu/assessment/\n" +
		"      (See Tompa et al., Nat. Biotechnol.(23), 137-144, 2005.)\n" +
		"  -s: report matches in format required for upload at\n" +
		"      http://tare.medisin.ntnu.no/\n" +
		"      (See Sandve et al., BMC Bioinformatics(8):193, 2007.)";
	}

	/** Returns a one-line description of this subcommand. */
	@Override
	public String description() {
		return "Reports the number of matches of a given IUPAC pattern.";
	}

	/** Returns the name under which this subcommand is invoked. */
	@Override
	public String name() {
		return "count-matches";
	}

	/**
	 * Parses the command line, reads the FASTA file, builds an automaton for
	 * the pattern and prints the matches in the selected output format.
	 *
	 * <p>If the FASTA file cannot be read, the error is logged and the JVM
	 * is terminated with exit status 1.</p>
	 *
	 * @param args command line arguments of this subcommand
	 * @return 0 after the report has been printed
	 */
	@Override
	public int run(String[] args) {
		parseOptions(args, 2, "rts");

		// Option dependencies: the two instance list formats exclude each other.
		exclusiveOptions("s", "t");

		// Mandatory arguments
		String filename = getStringArgument(0);
		String pattern = getStringArgument(1);

		// Options
		boolean considerReverse = getBooleanOption("r", false);
		boolean tompaFormat = getBooleanOption("t", false);
		boolean sandveFormat = getBooleanOption("s", false);

		Alphabet alphabet = Alphabet.getDnaAlphabet();
		List<NamedSequence> namedSequences = null;
		try {
			namedSequences = SequenceUtils.readFastaFile(filename, alphabet);
		} catch (Exception e) {
			Log.errorln(e.toString());
			System.exit(1);
		}
		List<int[]> sequences = SequenceUtils.sequenceList(namedSequences);

		// Construct the automaton from the pattern and, if requested, its
		// reverse complement (hence room for up to two entries).
		List<GeneralizedString> patterns = new ArrayList<GeneralizedString>(2);
		patterns.add(Iupac.toGeneralizedString(pattern));
		if (considerReverse) {
			patterns.add(Iupac.toGeneralizedString(Iupac.reverseComplementary(pattern)));
		}
		// NOTE(review): the meaning of the literal 10000 is defined by
		// DFAFactory.build and is not visible here -- confirm before changing.
		CDFA cdfa = DFAFactory.build(alphabet, patterns, 10000);

		if (tompaFormat) {
			printTompaInstances(filename, pattern.length(), sequences, alphabet, cdfa);
		} else if (sandveFormat) {
			printSandveInstances(pattern.length(), namedSequences, alphabet, cdfa);
		} else {
			printMatchCounts(sequences, cdfa);
		}
		return 0;
	}

	/**
	 * Prints all matches in the format selected by option {@code -t}: a
	 * header naming the data set (last path component of the file name)
	 * followed by one line per match consisting of the sequence index, the
	 * match start relative to the sequence end (a non-positive number) and
	 * the matched text.
	 */
	private static void printTompaInstances(String filename, int motifLength,
			List<int[]> sequences, Alphabet alphabet, CDFA cdfa) {
		Log.println(Log.Level.STANDARD, ">data set");
		String[] pathComponents = filename.split("\\/");
		Log.println(Log.Level.STANDARD, pathComponents[pathComponents.length-1]);
		Log.println(Log.Level.STANDARD, ">instances");
		int sequenceIndex = 0;
		for (int[] sequence : sequences) {
			for (CDFA.MatchPosition mp : cdfa.findMatchPositions(sequence)) {
				// getPosition() is treated as the index of the last matched
				// character, so the match starts motifLength-1 earlier.
				int start = mp.getPosition() - motifLength + 1;
				String matched = alphabet.buildString(Arrays.copyOfRange(sequence, start, start + motifLength));
				Log.printf(Log.Level.STANDARD, "%d,%d,%s%n", sequenceIndex, start - sequence.length, matched);
			}
			sequenceIndex += 1;
		}
	}

	/**
	 * Prints all matches in the format selected by option {@code -s}: one
	 * line per match with sequence name, start index, end index (inclusive)
	 * and matched text.
	 */
	private static void printSandveInstances(int motifLength,
			List<NamedSequence> namedSequences, Alphabet alphabet, CDFA cdfa) {
		for (NamedSequence ns : namedSequences) {
			// Build the textual form once per sequence; it is used both for
			// searching and for extracting the matched substrings.
			String text = alphabet.buildString(ns.getSequence());
			for (CDFA.MatchPosition mp : cdfa.findMatchPositions(text)) {
				int lastPos = mp.getPosition();
				int firstPos = lastPos - motifLength + 1;
				Log.printf(Log.Level.STANDARD, "%s, %d, %d, %s%n", ns.getName(), firstPos, lastPos, text.substring(firstPos, lastPos + 1));
			}
		}
	}

	/**
	 * Prints the number of matches for every sequence (numbered from 0 in
	 * input order) followed by the total over all sequences.
	 */
	private static void printMatchCounts(List<int[]> sequences, CDFA cdfa) {
		int sequenceIndex = 0;
		int totalMatches = 0;
		for (int[] sequence : sequences) {
			int matches = cdfa.countMatches(sequence);
			Log.printf(Log.Level.STANDARD, ">>sequence>> %d >>matches>> %d%n", sequenceIndex, matches);
			sequenceIndex += 1;
			totalMatches += matches;
		}
		Log.printf(Log.Level.STANDARD, "Total matches: %d%n", totalMatches);
	}
}
